# Load needed packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(colorspace)
library(bipartite)  
## Warning: package 'bipartite' was built under R version 4.3.3
## Loading required package: vegan
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-6.1
## Loading required package: sna
## Warning: package 'sna' was built under R version 4.3.3
## Loading required package: statnet.common
## 
## Attaching package: 'statnet.common'
## The following objects are masked from 'package:base':
## 
##     attr, order
## Loading required package: network
## 
## 'network' 1.18.2 (2023-12-04), part of the Statnet Project
## * 'news(package="network")' for changes since last version
## * 'citation("network")' for citation information
## * 'https://statnet.org' for help, support, and other information
## sna: Tools for Social Network Analysis
## Version 2.8 created on 2024-09-07.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##  For citation information, type citation("sna").
##  Type help(package="sna") to get started.
##  This is bipartite 2.20.
##  For latest changes see versionlog in ?"bipartite-package". For citation see: citation("bipartite").
##  Have a nice time plotting and analysing two-mode networks.
## 
## Attaching package: 'bipartite'
## The following object is masked from 'package:vegan':
## 
##     nullmodel
library(tidyr)
library(readr)
library(vegan)

# setwd("/Users/shirn/OneDrive/Documents/master/data")
setwd("/Users/shirnehoray/EDA/plant_pollinators_prediction/EDA_Eupoll")


# Read the data
data <- read.csv("Interaction_data.csv", encoding = "latin1")

# Get unique species lists for each network
species_per_network <- data %>%
  group_by(Bioregion, Network_id) %>%
  summarise(
    Plants = list(unique(Plant_accepted_name)),
    Pollinators = list(unique(Pollinator_accepted_name)),
    .groups = "drop"
  )

# Function to calculate Jaccard similarity
jaccard_similarity <- function(set1, set2) {
  inter <- length(intersect(set1, set2))
  union <- length(union(set1, set2))
  if (union == 0) return(NA) else return(inter / union)
}

# Loop over each bioregion and create heatmap for each one
bioregions <- unique(species_per_network$Bioregion)
# For plants species 

for (bio in bioregions) {
  cat("Plotting Bioregion:", bio, "\n")
  
  sub_data <- species_per_network %>% filter(Bioregion == bio)
  nets <- sub_data$Network_id
  
  # Build similarity matrix
  mat <- matrix(NA, nrow = length(nets), ncol = length(nets),
                dimnames = list(nets, nets))
  for (i in seq_along(nets)) {
    for (j in seq_along(nets)) {
      mat[i,j] <- jaccard_similarity(sub_data$Plants[[i]], sub_data$Plants[[j]])
    }
  }
  
  # Turn into data frame 
  mat_df <- as.data.frame(as.table(mat))
  
  # Plot heatmap
  p <- ggplot(mat_df, aes(Var1, Var2, fill = Freq)) +
    geom_tile()  +
    scale_fill_continuous_sequential(palette = "Viridis",) +
    theme_minimal() +
    labs(title = paste("Plant species similarity -", bio),
         x = "Network_id", y = "Network_id", fill = "Jaccard Index") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 3),
          axis.text.y = element_text(size = 3))
  
  
  print(p)
}
## Plotting Bioregion: Alpine

## Plotting Bioregion: Atlantic

## Plotting Bioregion: Boreal

## Plotting Bioregion: Continental

## Plotting Bioregion: Mediterranean

## Plotting Bioregion: Pannonian

## Plotting Bioregion: Steppic

# For pollinators species 

for (bio in bioregions) {
  cat("Plotting Bioregion:", bio, "\n")
  
  sub_data <- species_per_network %>% filter(Bioregion == bio)
  nets <- sub_data$Network_id
  
  # Build similarity matrix
  mat <- matrix(NA, nrow = length(nets), ncol = length(nets),
                dimnames = list(nets, nets))
  for (i in seq_along(nets)) {
    for (j in seq_along(nets)) {
      mat[i,j] <- jaccard_similarity(sub_data$Pollinators[[i]], sub_data$Pollinators[[j]])
    }
  }
  
  # Turn into data frame
  mat_df <- as.data.frame(as.table(mat))
  
  # Plot heatmap
  p <- ggplot(mat_df, aes(Var1, Var2, fill = Freq)) +
    geom_tile() +
    scale_fill_continuous_sequential(palette = "Viridis") +
    theme_minimal() +
    labs(title = paste("Pollinators species similarity -", bio),
         x = "Network_id", y = "Network_id", fill = "Jaccard Index") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 3),
          axis.text.y = element_text(size = 3))
  
  
  print(p)
}
## Plotting Bioregion: Alpine

## Plotting Bioregion: Atlantic

## Plotting Bioregion: Boreal

## Plotting Bioregion: Continental

## Plotting Bioregion: Mediterranean

## Plotting Bioregion: Pannonian

## Plotting Bioregion: Steppic

bioregions <- c("Atlantic", "Boreal", "Continental", "Alpine", "Mediterranean", "Steppic", "Pannonian")
min_total <- 15     # minimum total species per network
top_n_pairs <- 100   # how many top pairs to keep (optional)


## =========Utilities===========

normalize_name <- function(x) {
  # Optional: standardize species names (trim, collapse spaces, lower-case)
  x <- trimws(x)
  x <- gsub("\\s+", " ", x)
  tolower(x)
}

jaccard_sets <- function(a, b) {
  a <- unique(a); b <- unique(b)
  ia <- length(intersect(a, b))
  ua <- length(union(a, b))
  if (ua == 0) NA_real_ else ia / ua
}

## ========= Loop for each bioregions===========

for (bio in bioregions) {
  cat(sprintf("\n=== Processing Bioregion: %s ===\n", bio))

  ## ======== Set Columns ==========

  req_cols <- c("Bioregion", "Network_id", "Plant_accepted_name", "Pollinator_accepted_name")
  missing_cols <- setdiff(req_cols, names(data))
  if (length(missing_cols) > 0) {
    cat(paste("Missing required columns:", paste(missing_cols, collapse = ", "), "\n"))
    next
  }

  ## ========= Set the chosen bioregion ========

  dat_bio <- data[data$Bioregion == bio, , drop = FALSE]

  if (nrow(dat_bio) == 0) {
    cat(paste("No rows for Bioregion:", bio, "\n"))
    next
  }

  ## ===== species counts per network (to filter small networks) ====
  
  # Distinct plants per network
  plants_by_net <- aggregate(Plant_accepted_name ~ Network_id, data = dat_bio,
                             FUN = function(x) length(unique(x)))
  names(plants_by_net)[2] <- "n_plants"

  # Distinct pollinators per network
  polls_by_net <- aggregate(Pollinator_accepted_name ~ Network_id, data = dat_bio,
                            FUN = function(x) length(unique(x)))
  names(polls_by_net)[2] <- "n_pollinators"

  # Merge counts and compute total
  counts <- merge(plants_by_net, polls_by_net, by = "Network_id", all = TRUE)
  counts$n_plants[is.na(counts$n_plants)] <- 0
  counts$n_pollinators[is.na(counts$n_pollinators)] <- 0
  counts$total_species <- counts$n_plants + counts$n_pollinators

  # Keep networks meeting the threshold
  nets_keep <- counts$Network_id[counts$total_species >= min_total]
  if (length(nets_keep) == 0) {
    cat(paste("No networks with total_species >=", min_total, "in", bio, "\n"))
    next
  }

  ## ====== Build species sets per network (plants + pollinators together) =====
  # Subset to kept networks
  dat_bio_sub <- dat_bio[dat_bio$Network_id %in% nets_keep, , drop = FALSE]

  # Normalize names (optional but recommended)
  plant_names <- normalize_name(dat_bio_sub$Plant_accepted_name)
  poll_names  <- normalize_name(dat_bio_sub$Pollinator_accepted_name)

  # Tag guilds to avoid accidental collisions
  plant_tags <- paste0("PLANT::", plant_names)
  poll_tags  <- paste0("POLL::",  poll_names)

  # Split row indices by Network_id so we can collect species per network
  idx_by_net <- split(seq_len(nrow(dat_bio_sub)), dat_bio_sub$Network_id)

  # For each network, take the union of unique plant & pollinator tags
  species_items <- lapply(idx_by_net, function(idx) {
    unique(c(plant_tags[idx], poll_tags[idx]))
  })
  # 'species_items' is a named list: names(species_items) are Network_id values

  net_ids <- names(species_items)
  n <- length(net_ids)

  ## ======== Pairwise Jaccard matrix =====
  cat(sprintf("Number of networks for %s: %d\n", bio, n)) # Diagnostic
  sim_mat <- matrix(NA_real_, nrow = n, ncol = n, dimnames = list(net_ids, net_ids))

  for (i in seq_len(n)) {
    # diagonal
    sim_mat[i, i] <- NA_real_
    # upper triangle only
    if (i < n) {
      for (j in (i + 1L):n) {
        val <- jaccard_sets(species_items[[i]], species_items[[j]])
        sim_mat[i, j] <- val
        sim_mat[j, i] <- val
      }
    }
  }

  ## ===== Tidy table of unique pairs =====
  # Convert matrix to data.frame of pairs
  # keep only i<j to avoid duplicates
  pairs_list <- list()
  k <- 0L
  for (i in seq_len(n)) {
    if (i < n) {
      for (j in (i + 1L):n) {
        k <- k + 1L
        pairs_list[[k]] <- data.frame(
          Net1 = net_ids[i],
          Net2 = net_ids[j],
          Jaccard = sim_mat[i, j],
          stringsAsFactors = FALSE
        )
      }
    }
  }
  sim_df <- do.call(rbind, pairs_list)
  if (is.null(sim_df) || nrow(sim_df) == 0 || !("Jaccard" %in% names(sim_df)) || all(is.na(sim_df$Jaccard))) {
    cat(sprintf("No valid Jaccard similarities for %s (empty or all NA).\n", bio))
    next
  }

  # Remove NA and sort descending
  sim_df <- sim_df[!is.na(sim_df$Jaccard), , drop = FALSE]
  ord <- order(sim_df$Jaccard, decreasing = TRUE)
  sim_df <- sim_df[ord, , drop = FALSE]

  # Keep top N pairs (optional)
  if (!is.null(top_n_pairs) && is.finite(top_n_pairs)) {
    top_n <- min(top_n_pairs, nrow(sim_df))
    sim_top <- sim_df[seq_len(top_n), , drop = FALSE]
  } else {
    sim_top <- sim_df
  }

  ## ======Results =======
  cat(sprintf("\n=== Top similar network pairs for %s ===\n", bio))
  print(head(sim_top, 20), row.names = FALSE)

  ## ======= Heatmap of Jaccard similarities ======
  # Convert sim_mat to long format for ggplot
  sim_mat_long <- as.data.frame.table(sim_mat, responseName = "Jaccard")
  names(sim_mat_long)[1:2] <- c("Net1", "Net2")
  sim_mat_long <- sim_mat_long[!is.na(sim_mat_long$Jaccard), ]

  # # Create heatmap
  # p_heatmap <- ggplot(sim_mat_long, aes(x = Net1, y = Net2, fill = Jaccard)) +
  #   geom_tile() +
  #   scale_fill_continuous_sequential(palette = "Viridis", name = "Jaccard Similarity") +
  #   labs(
  #     title = sprintf("Jaccard Similarity Heatmap for %s", bio),
  #     x = "Network ID", y = "Network ID"
  #   ) +
  #   theme_minimal() +
  #   theme(
  #     axis.text.x = element_text(angle = 75, hjust = 1, size = 3),
  #     axis.text.y = element_text(size = 3),
  #     panel.grid = element_blank()
  #   )
  # 
  # print(p_heatmap)



  ## ===== Collect per-network coordinates for the networks in top pairs ========
  # unique networks from the top pairs
  unique_nets <- unique(c(sim_top$Net1, sim_top$Net2))

  # subset interactions to the chosen bioregion and those networks
  dat_map <- dat_bio_sub[dat_bio_sub$Network_id %in% unique_nets, , drop = FALSE]

  if (!all(c("Latitude","Longitude") %in% names(dat_map))) {
    cat("Latitude/Longitude columns are missing in the dataset.\n")
    next
  }

  # Ensure numeric (safely)
  dat_map$Latitude  <- suppressWarnings(as.numeric(dat_map$Latitude))
  dat_map$Longitude <- suppressWarnings(as.numeric(dat_map$Longitude))

  # remove rows with missing coordinates
  dat_map <- dat_map[!is.na(dat_map$Latitude) & !is.na(dat_map$Longitude), , drop = FALSE]
  if (nrow(dat_map) == 0) {
    cat("No valid coordinates to plot.\n")
    next
  }

  ## For each Network_id, take a single representative lat/lon (first unique non-NA)
  first_unique <- function(x) {
    ux <- unique(x)
    ux[!is.na(ux)][1]
  }

  # get the location for each network 
  
 # aggregate coordinates per network 
  lat_by_net <- aggregate(Latitude  ~ Network_id, data = dat_map, FUN = first_unique)
  lon_by_net <- aggregate(Longitude ~ Network_id, data = dat_map, FUN = first_unique)


  # merge back
  net_locs <- merge(lat_by_net, lon_by_net, by = "Network_id", all = TRUE)

  # add total species to size points (we already computed 'counts' earlier)
  net_locs <- merge(net_locs, counts[, c("Network_id","total_species")], by = "Network_id", all.x = TRUE)

  # keep only networks we actually plot
  net_locs <- net_locs[net_locs$Network_id %in% unique_nets, , drop = FALSE]

  ## ======Determine map extents ========

  lat_range <- range(net_locs$Latitude,  na.rm = TRUE)
  lon_range <- range(net_locs$Longitude, na.rm = TRUE)
  lat_buffer <- 2
  lon_buffer <- 3
  xlim <- c(lon_range[1] - lon_buffer, lon_range[2] + lon_buffer)
  ylim <- c(lat_range[1] - lat_buffer, lat_range[2] + lat_buffer)

  ## ====== Base map (Europe) and plot ============

  europe <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf", continent = "Europe")

  # simple palette for distinct networks
  n_cols <- length(unique(net_locs$Network_id))
  cols <- grDevices::hcl.colors(n_cols, palette = "Spectral", rev = TRUE)

   p_map <- ggplot(europe) +
    geom_sf(fill = "beige", color = "black", linewidth = 0.2) +
    geom_point(
      data = net_locs,
      aes(x = Longitude, y = Latitude, color = Network_id, size = total_species),
      alpha = 0.8
    ) +
    scale_color_manual(values = cols, guide = "none") +
    scale_size_continuous(range = c(2, 8), name = "Total Species") +
    coord_sf(xlim = xlim, ylim = ylim, expand = FALSE, default_crs = NULL) +
    labs(
      title = sprintf("Top similar networks in %s ", bio),
      subtitle = sprintf("Points sized by total species",
                         lat_range[1], lat_range[2]),
      x = "Longitude", y = "Latitude", color = "Network_id"
    ) +
    theme_minimal() +
    theme(legend.position = "bottom")

  print(p_map)
}
## 
## === Processing Bioregion: Atlantic ===
## Number of networks for Atlantic: 215
## 
## === Top similar network pairs for Atlantic ===
##                                         Net1
##                                  MG11_E1_BC1
##                                  OS24_E2_BC1
##  Creehaun.Carron.Co.Clare_4.2018_GA1.GS1.WS1
##                                  CG43_E2_GA1
##                                   MT4_E1_BC1
##                           Chize_14_Grassland
##                                  MG18_E1_BC1
##                                  CG43_E1_GA1
##                           Chize_24_Grassland
##                        Chateau_Gaillard_2016
##                      Chize_12_Field_boundary
##                      Chize_17_Field_boundary
##                       Chize_5_Field_boundary
##                                  Larris_2016
##                       Chize_1_Field_boundary
##                       Chize_1_Field_boundary
##                                  CT32_E1_BC1
##                               Tibarney_3_GS1
##                         Carrownalassan_3_GS1
##                                  CG43_E1_GA1
##                                  Net2   Jaccard
##                           MG11_E2_BC1 0.6071429
##                           OS26_E1_BC1 0.5909091
##  Derreenatloghtan.Co.Clare_1.2017_GS1 0.5652174
##                           MG18_E2_BC1 0.5454545
##                            MT4_E2_BC1 0.5454545
##                    Chize_25_Grassland 0.5434783
##                           MG18_E2_BC1 0.5416667
##                           CG43_E2_GA1 0.5217391
##                    Chize_25_Grassland 0.5094340
##                         Falaises_2016 0.5086957
##               Chize_14_Field_boundary 0.5000000
##                Chize_5_Field_boundary 0.5000000
##                Chize_9_Field_boundary 0.5000000
##                             Riez_2016 0.4879518
##               Chize_19_Field_boundary 0.4791667
##                     Chize_1_Grassland 0.4782609
##                            MT1_E2_BC1 0.4782609
##                         Turrock_6_GS1 0.4666667
##                        Tibarney_3_GS1 0.4642857
##                           MG11_E1_BC1 0.4642857
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'
## 
## === Processing Bioregion: Boreal ===
## Number of networks for Boreal: 276
## 
## === Top similar network pairs for Boreal ===
##      Net1     Net2   Jaccard
##  L23_2017 L23_2018 1.0000000
##  L35_2017 L35_2018 1.0000000
##  L39_2017 L39_2018 1.0000000
##  L46_2017 L46_2018 1.0000000
##  L13_2017 L13_2018 0.9729730
##  L27_2019 L27_2020 0.9642857
##  L28_2019 L28_2020 0.9500000
##  L13_2018 L13_2019 0.9473684
##  L28_2020 L28_2021 0.9473684
##  L17_2017 L17_2018 0.9354839
##  L17_2019 L17_2020 0.9354839
##  L23_2017 L23_2019 0.9333333
##  L23_2018 L23_2019 0.9333333
##  L12_2017 L12_2018 0.9310345
##  L13_2017 L13_2019 0.9230769
##  L45_2017 L45_2018 0.9166667
##  L31_2018 L31_2019 0.9130435
##  L31_2017 L31_2018 0.9090909
##  L35_2017 L35_2019 0.9090909
##  L35_2018 L35_2019 0.9090909
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

## 
## === Processing Bioregion: Continental ===
## Number of networks for Continental: 341
## 
## === Top similar network pairs for Continental ===
##          Net1         Net2   Jaccard
##  UNIPD03 CD23 UNIPD03 CD29 0.5733333
##  UNIPD03 CD15 UNIPD03 CD16 0.5376344
##  UNIPD03 CD27 UNIPD03 CD28 0.5301205
##  UNIPD03 CD15 UNIPD03 CD17 0.5268817
##  UNIPD03 CD20 UNIPD03 CD27 0.5256410
##  UNIPD03 CD35  UNIPD03 D34 0.5230769
##  UNIPD03 CD15 UNIPD03 CD22 0.5205479
##  UNIPD03 CD16 UNIPD03 CD17 0.5100000
##   UNIPD03 D17  UNIPD03 D29 0.5094340
##  UNIPD03 CD15 UNIPD03 CD28 0.5054945
##  UNIPD03 CD22 UNIPD03 CD23 0.5000000
##   UNIPD03 D23  UNIPD03 D24 0.5000000
##   UNIPD03 D24  UNIPD03 E35 0.5000000
##   UNIPD03 E19  UNIPD03 E36 0.5000000
##   UNIPD03 D29  UNIPD03 D35 0.4905660
##  UNIPD03 CD16 UNIPD03 CD28 0.4897959
##   UNIPD03 E14  UNIPD03 E26 0.4880952
##   UNIPD03 D11  UNIPD03 D24 0.4833333
##  UNIPD03 CD15 UNIPD03 CD20 0.4827586
##   UNIPD03 D24  UNIPD03 E31 0.4821429
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

## 
## === Processing Bioregion: Alpine ===
## Number of networks for Alpine: 111
## 
## === Top similar network pairs for Alpine ===
##  Net1 Net2   Jaccard
##  wmb4 wmb5 0.6440678
##    t1   t5 0.6136364
##  wmb1 wmb2 0.5974026
##   ho4 wmb4 0.5937500
##   ho4 wmb5 0.5781250
##    h1 wma2 0.5593220
##    h4  ho4 0.5555556
##    h3 wma2 0.5538462
##    h6   h7 0.5510204
##  wma2 wma3 0.5322581
##    h6  ho5 0.5306122
##    h4   h5 0.5303030
##    h7  ho5 0.5283019
##    h1 wmb2 0.5277778
##    h2 wma2 0.5270270
##    h1 wmb1 0.5217391
##    h7 wma5 0.5192308
##  wmb1 wmb3 0.5180723
##    h4 wmb5 0.5070423
##  wmb2 wmb3 0.5057471
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

## 
## === Processing Bioregion: Mediterranean ===
## Number of networks for Mediterranean: 99
## 
## === Top similar network pairs for Mediterranean ===
##                              Net1                             Net2   Jaccard
##    nevero_Encroached pasture_2011 penalara_Encroached pasture_2011 0.5520000
##    nevero_Encroached pasture_2011              nevero_Pasture_2011 0.5378151
##               nevero_Pasture_2011 penalara_Encroached pasture_2011 0.5217391
##  penalara_Encroached pasture_2011            penalara_Pasture_2011 0.5181818
##               nevero_Pasture_2011            penalara_Pasture_2011 0.5145631
##                            FRA1OP                           FRA2OP 0.5000000
##                             Canal                           Menajo 0.4800000
##                             Canal                            Pinar 0.4769231
##                            Menajo                        Redondela 0.4736842
##                            MED3CA                           MED4CA 0.4693878
##                            FRA2OP                           MIQ2OP 0.4651163
##                            BAT1CA                           BAT2CA 0.4561404
##                            Menajo                            Pinar 0.4545455
##                            MED2CA                           MED4CA 0.4500000
##                           Cartaya                            Pinar 0.4487179
##    nevero_Encroached pasture_2011            penalara_Pasture_2011 0.4462810
##                             Curva                            Pinar 0.4459459
##                           Cetrero                            Pinar 0.4457831
##                             Barca                            Pinar 0.4444444
##             Bois_de_Fontaret_2016                    Fourches_2016 0.4424242
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'

## 
## === Processing Bioregion: Steppic ===
## Number of networks for Steppic: 1
## No valid Jaccard similarities for Steppic (empty or all NA).
## 
## === Processing Bioregion: Pannonian ===
## Number of networks for Pannonian: 4
## 
## === Top similar network pairs for Pannonian ===
##  Net1 Net2    Jaccard
##     2    5 0.14285714
##    17    5 0.07692308
##    49    5 0.04545455
##     2   49 0.02857143
##    17   49 0.02702703
##    17    2 0.00000000
## Warning in sprintf("Points sized by total species", lat_range[1],
## lat_range[2]): 2 arguments not used by format 'Points sized by total species'